library(RODBC)
library(knitr)
library(tidyverse)
knitr::opts_chunk$set(echo = FALSE,
                      message = FALSE,
                      warning = FALSE,
                      error = TRUE)
startTime <- Sys.time()
connectionString <- getConnectionString(params)
run_query <- function(query_text, ...){
  result <- run_db_query(connectionString, query_text )
  return(result)
}

get_connection <- function(){
  connection <- get_new_connection(connectionString)
  return(connection)
}
tbresult <- runTableReplacements(connectionString)
QA_Alert_Messages <- data.frame('Alert Code'=character(),
                 'Alert Message'=character(), stringsAsFactors = FALSE)

CHORDS QA Report: VDW P3 Tables

The purpose of the data quality program is to conduct data quality checks that correspond to similar checks carried out by PCORnet (Version 7). The checks cover four domains of data quality: data model conformance, data plausibility, data completeness, and data persistence. The program runs a series of SQL queries from R/RStudio to produce this report. The report provides tables and figures with detailed information about each data quality check, as well as an alert when a data error appears to be present (based on criteria developed by PCORnet or the CHORDS team).

This data quality report was generated from CHORDS r params$DBName.

Information about the QA program

Data Partner:
Analyst:
Query Run Date: r Sys.Date()
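
Each data check below appends an alert code and message to the QA_Alert_Messages data frame created above; the accumulated alerts are printed in the QA Alerts section of the results. A minimal sketch of that pattern, with placeholder values for illustration only:

# Illustrative only -- "0.00" and the text below are placeholders, not a real check:
# QA_Alert_Messages[nrow(QA_Alert_Messages) + 1, ] <- c("0.00", "Example alert message")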

# Tables to check for quality
tableCheckList <- c(lab_results, encounters, prescribing, procedures, social_history, provider_specialty, diagnoses, vital_signs, census_location, demographics, pro_surveys, pro_questions, pro_responses)
# table list in DB
sqlConnection <- get_connection()
dbTables <- sqlTables(sqlConnection)
RODBC::odbcClose(sqlConnection)
tablesChecked <- vector(length = length(tableCheckList))
for (i in seq_along(tableCheckList)){
  tablesChecked[i] <- sum(grepl(tableCheckList[i], dbTables$TABLE_NAME, ignore.case = T)) > 0
}
missingTables <- setdiff(tableCheckList, tableCheckList[tablesChecked])
nonMissingTables <- setdiff(tableCheckList, missingTables)
nonMissingDBTables <- subset(dbTables, grepl(paste(tableCheckList, collapse = '|'), TABLE_NAME, ignore.case = T), select=TABLE_NAME)[,,drop=T]


missingTableMessage <- if(length(missingTables)==0) {
  "All tables accounted for."
} else{
    paste("The following tables are missing:", paste(missingTables, collapse = ','), sep = ' ')
}

  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.01",missingTableMessage)
## Data Check 1.02: Required tables are not populated

table_N_lst <- list() #vector(length=length(nonMissingDBTables))
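# Row counts for each table found above; a result that is not a data.frame (a failed query) is removed from the list.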
for(i in 1:length(nonMissingDBTables)){
  table_N_lst[[i]] <- run_query(paste("select distinct '_aaa_' as _aaa_, count(*) as tabN from ",nonMissingDBTables[i], sep=' '))#[,'tabN',drop=T]
 # print(dbTables[i])
  if (!is.data.frame(table_N_lst[[i]])) {table_N_lst[[i]] <- NULL}
}
#length(dbTables)
table_N_lst <- do.call('rbind', table_N_lst) [,'tabN',drop=T]
nonPopTables <- nonMissingDBTables[table_N_lst==0]
nonPopMessage <- if(length(nonPopTables)==0) {"All tables populated, among those present."
} else{
    paste("The following tables exist, but are not populated:",  paste(nonPopTables, collapse = ','), sep = ' ')
}
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.02",nonPopMessage)
sqlFiles <- system.file('sql', package='chordsTables')
schema_validation_query <- read_file(dir(sqlFiles, "VDW_Schema_Validation.sql", full.names = TRUE))
connection103 <- get_connection()
schema_validation <- sqlQuery(connection103, schema_validation_query, as.is=TRUE)
odbcClose(connection103)

## Data Check 1.03: Required fields are not present

missing_columns <- subset(schema_validation, Result == "TABLE/VIEW OR COLUMN MISSING")
missing_columnsTables <- select(missing_columns, c("TableName", "ColumnName"))
if (params$QAAlert == TRUE & nrow(missing_columnsTables) > 0){
  missing_columns$message <- paste0(missing_columns$TableName, ".", missing_columns$ColumnName, sep="")
  QA_Alert_Message_103_items <- paste0(missing_columns$message, collapse = ', ')
  QA_Alert_Message_103 <- paste0("The following required fields are not present: ", QA_Alert_Message_103_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.03",QA_Alert_Message_103)
}
misconfig <- subset(schema_validation, (Result == "CONFIG-MISMATCH" & ExpectedIsNullable == FoundIsNullable)) 
invalid_specs <- select(misconfig, c("TableName", "ColumnName", "ExpectedNumberPrecision", "FoundNumberPrecision", "ExpectedNumberScale", "FoundNumberScale", "ExpectedCharLength", "FoundCharLength", "ExpectedDatePrecision", "FoundDatePrecision"))
if (params$QAAlert == TRUE & nrow(invalid_specs) > 0){
  misconfig$message <- paste0(misconfig$TableName, ".", misconfig$ColumnName, sep="")
  QA_Alert_Message_104_items <- paste0(misconfig$message, collapse = ', ')
  QA_Alert_Message_104 <- paste0("The following required fields do not conform to data model specifications for data type, length, or name: ",
                                  QA_Alert_Message_104_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.04",QA_Alert_Message_104)
}
primary_key_data <- run_query("
/*TABLEREPLACEMENTINFILE*/
/*****************************************************************************
Script for validating the VDW Primary Keys.  It will create two lists.  One of
the Expected Tables and their primary keys and another of all the tables currently in
the partner VDW.  It will validate:
    - Compare the expected primary key to the primary key of each
    table in a partner's VDW
    - The number of records that currently violate the expected primary key
FOR DEVELOPERS:
To Update the script for partner use:
1) Edit the #PKVALIDATION table insertion if the primary keys of the VDW have
changed or add new entries for new tables.  One row per primary key column.
Examples below:
Output:
    One output table
    1)  Shows the Table name, the expected primary key, the found primary key,
    and the result of the comparison between the expected and found primary keys.
    KeyMatchResultValues:
        - OK: Expected primary key matches the found primary key
        - KEY MISMATCH: Expected Primary Key does not match the found primary key
        - TABLE OR KEYS NOT FOUND: No table in the VDW matches the expected table
        - UNKNOWN: An unknown error occurred
    ViolationCount:
        - The number of records that appear to violate the expected primary key.
*****************************************************************************/
/*****************************************************************************
BEGIN TempTable Clearing and Creation
*****************************************************************************/
SET ANSI_NULLS ON;
SET QUOTED_IDENTIFIER ON;
SET NOCOUNT ON;
SET ANSI_WARNINGS OFF;
BEGIN

    IF OBJECT_ID('tempdb..#PKVALIDATION') IS NOT NULL
    BEGIN
       DROP TABLE #PKVALIDATION;
    END;
    CREATE TABLE #PKVALIDATION (
        Table_Name      VARCHAR(250) ,
        Column_Name     VARCHAR(250)
    );
    IF OBJECT_ID('tempdb..#KeyMatchResult') IS NOT NULL
    BEGIN
       DROP TABLE #KeyMatchResult;
    END;
    IF OBJECT_ID('tempdb..#PKVIOLATIONS') IS NOT NULL
    BEGIN
       DROP TABLE #PKVIOLATIONS;
    END;

    CREATE TABLE #PKVIOLATIONS (
        Table_Name      VARCHAR(250) ,
        Violation_Count BIGINT
    );
END;
/*****************************************************************************
END TempTable Clearing and Creation
*****************************************************************************/
/*****************************************************************************
BEGIN Table Create Section
*****************************************************************************/
INSERT INTO #PKVALIDATION
VALUES      (
       'CENSUS_DEMOG', 'CENSUS_YEAR'), (
       'CENSUS_DEMOG', 'GEOCODE'), (
       'EVERNDC', 'NDC'), (
       'EVERNDC', 'GENERIC'), (
       'PROVIDER_SPECIALTY', 'PROVIDER'), (
       'DEATH', 'PERSON_ID'), (
       'CAUSE_OF_DEATH', 'PERSON_ID'), (
       'CAUSE_OF_DEATH', 'COD'), (
       'DEMOGRAPHICS', 'PERSON_ID'), (
       'LINKAGE', 'LINK_ID'), (
       'LINKAGE', 'LINE'), (
       'BENEFIT', 'BENEFIT_ID'), (
       'ENCOUNTERS', 'ENC_ID'), (
       'DIAGNOSES', 'DIAGNOSES_ID'), (
       'ENROLLMENT', 'PERSON_ID'), (
       'ENROLLMENT', 'ENR_START'), (
       'LAB_RESULTS', 'LAB_RESULTS_ID'), (
       'PRO_SURVEYS', 'PRO_ID'), (
       'PRO_QUESTIONS', 'PRO_ID'), (
       'PRO_QUESTIONS', 'QUESTION_ID'), (
       'PRO_QUESTIONS', 'QUESTION_VER'), (
       'PRO_RESPONSES', 'RESPONSE_ID'), (
       'PHARMACY', 'PHARMACY_ID'), (
       'PRESCRIBING', 'PRESCRIBING_ID'), (
       'PROCEDURES', 'PROCEDURES_ID'), (
       'SOCIAL_HISTORY', 'SOCIAL_HISTORY_ID'), (
       'VITAL_SIGNS', 'VITAL_SIGNS_ID'), (
       'TUMOR', 'TUMOR_ID'), (
       'LANGUAGES', 'PERSON_ID'), (
       'LANGUAGES', 'LANG_ISO'), (
       'CENSUS_LOCATION', 'PERSON_ID'), (
       'CENSUS_LOCATION', 'LOC_START');
/*****************************************************************************
BEGIN Table Name Replacement: If a TableName replacement table exists, it will swap out the 
names in the tables for the correct ones based on how it's mapped in their table.
*****************************************************************************/
BEGIN
IF OBJECT_ID('CHORDS_TABLENAMES') IS NOT NULL
    BEGIN
        UPDATE a
               SET a.Table_Name = b.NEW_NAME
        FROM #PKVALIDATION a JOIN CHORDS_TABLENAMES b ON b.ORG_NAME = a.Table_Name;
    END;
END;
/*****************************************************************************
END Table Name Replacement
*****************************************************************************/
/*****************************************************************************
BEGIN Analysis Section: Compares the partner's primary keys to the expected keys
******************************************************************************/
SELECT * INTO #KeyMatchResult
FROM (
SELECT            
       ExpectKeys.TABLE_NAME, 
       ExpectKeys.COLUMN_NAMES AS Expected_Primary_Key, 
       CurrKeys.COLUMN_NAMES AS Found_Primary_Key,
       CASE
           WHEN ob.type = 'V'
               THEN 'VIEW FOUND'
           WHEN CurrKeys.TABLE_NAME IS NULL
           THEN 'TABLE/VIEW OR KEYS NOT FOUND'
           WHEN ExpectKeys.COLUMN_NAMES != CurrKeys.COLUMN_NAMES
           THEN 'KEY MISMATCH'
           WHEN ExpectKeys.COLUMN_NAMES = CurrKeys.COLUMN_NAMES
           THEN 'OK'
           ELSE 'UNKNOWN ERROR'
       END AS Key_Match_Result
FROM                  
(
    SELECT        
           TABLE_NAME, 
           LEFT(COL, LEN(COL) - 1) AS COLUMN_NAMES
    FROM              
    (
        SELECT DISTINCT    
               TAB.TABLE_NAME TABLE_NAME, 
        (
            SELECT 
                   COL.Column_Name + ', ' AS [text()]
            FROM    
                 #PKVALIDATION COL
            WHERE  COL.Table_Name = TAB.Table_Name
            ORDER BY 
                     COL.Column_Name FOR
            XML PATH('')
        ) COL
        FROM 
             #PKVALIDATION TAB
    ) T
    WHERE T.COL IS NOT NULL
) ExpectKeys
LEFT JOIN
(
    SELECT        
           TABLE_NAME, 
           LEFT(COL, LEN(COL) - 1) AS COLUMN_NAMES
    FROM              
    (
        SELECT DISTINCT    
               TAB.TABLE_NAME TABLE_NAME, 
        (
            SELECT 
                   COL.COLUMN_NAME + ', ' AS [text()]
            FROM    
                 INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE COL
            WHERE  COL.CONSTRAINT_NAME = TAB.CONSTRAINT_NAME
                   AND COL.TABLE_NAME = TAB.TABLE_NAME
                   AND CONSTRAINT_TYPE = 'PRIMARY KEY'
            ORDER BY 
                     COL.COLUMN_NAME FOR
            XML PATH('')
        ) COL
        FROM 
             INFORMATION_SCHEMA.TABLE_CONSTRAINTS TAB
    ) T
    WHERE T.COL IS NOT NULL
) CurrKeys
     ON CurrKeys.TABLE_NAME = ExpectKeys.TABLE_NAME
     LEFT JOIN sys.objects ob ON ob.object_id = OBJECT_ID(ExpectKeys.TABLE_NAME)
     and ob.type in ('U', 'V')) KeyMatch;
BEGIN

    DECLARE @SQL NVARCHAR(3000);
    DECLARE @Table_Name VARCHAR(100);
    DECLARE @Key_Columns VARCHAR(100);

    DECLARE CUR CURSOR
    FOR SELECT
              Table_Name, Expected_Primary_Key
        FROM
             #KeyMatchResult;

    OPEN CUR;

    FETCH NEXT FROM CUR INTO @Table_Name, @Key_Columns;

    WHILE @@FETCH_STATUS = 0
        BEGIN
        SET @SQL = '
            IF OBJECT_ID(''' + @Table_Name + ''') IS NOT NULL
            WITH CTEKEY
                AS (SELECT 
                    COUNT(*) KEYTOT
                    FROM (
                        SELECT DISTINCT  ' + @Key_Columns + '
                        FROM ' + @Table_Name + ') z),
            CTETOT
                AS (SELECT COUNT(*) TABTOT
                    FROM ' + @Table_Name + ')
            INSERT INTO #PKVIOLATIONS
            SELECT  ''' + @Table_Name + ''', CTETOT.TABTOT - CTEKEY.KEYTOT
            FROM CTETOT, CTEKEY;'
            EXEC Sp_executesql
                 @SQL;
            FETCH NEXT FROM CUR INTO @Table_Name, @Key_Columns;
        END;

    CLOSE CUR;

    DEALLOCATE CUR; 
END
SELECT 
       m.*, 
       v.Violation_Count
FROM   
     #KeyMatchResult m
     LEFT JOIN #PKVIOLATIONS v
          ON v.Table_Name = m.TABLE_NAME;
/*****************************************************************************
END Analysis Section
*****************************************************************************/")
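# primary_key_data has one row per expected VDW table: the expected and found primary key,
# a Key_Match_Result status (OK, KEY MISMATCH, TABLE/VIEW OR KEYS NOT FOUND, VIEW FOUND, UNKNOWN ERROR),
# and Violation_Count, the number of rows that violate the expected key.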
violated_primary_keys <- subset(primary_key_data, Key_Match_Result != "OK")
if (params$QAAlert == TRUE & nrow(violated_primary_keys) > 0){
  violated_primary_keys_table <- subset(violated_primary_keys, (Violation_Count > 0 & Key_Match_Result == "VIEW FOUND") | (Key_Match_Result != "VIEW FOUND")) 
  violated_primary_keys_table$message <- paste0(violated_primary_keys_table$TABLE_NAME, sep="")
  QA_Alert_Message_105_items <- paste0(violated_primary_keys_table$message, collapse = ', ')
  QA_Alert_Message_105 <- paste0("The following tables have primary key definition errors:  ",
                                  QA_Alert_Message_105_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.05",QA_Alert_Message_105)
}
sqlFiles <- system.file('sql', package='chordsTables')
data_validation_query <- read_file(dir(sqlFiles, "VDW_DataValidation.sql", full.names = TRUE))
connection106 <- get_connection()
results <- sqlQuery(connection106, data_validation_query, as.is=TRUE)
data_validation <- sqlQuery(connection106, "
  SELECT
       *
  FROM
     #CHORDSDataValueResults
  ORDER BY TargetTable;
  ", as.is = TRUE)
ref_integrity <- sqlQuery(connection106, "
  SELECT
       *
  FROM
     #CHORDSReferentialIntegrityResults
  ORDER BY TargetTable;
  ", as.is = TRUE)
odbcClose(connection106)
if (params$QAAlert == TRUE & nrow(data_validation) > 0){
  data_validation_table <- data.frame(data_validation, stringsAsFactors = FALSE)
  data_validation_table$message <- paste0(data_validation_table$TargetTable, ".", data_validation_table$TargetColumn, sep="")
  QA_Alert_Message_106_items <- paste0(data_validation_table$message, collapse = ', ')
  QA_Alert_Message_106 <- paste0("The following required fields contain values outside of data model specifications: ",
                                  QA_Alert_Message_106_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.06", QA_Alert_Message_106)
}
nullableValues <- subset(schema_validation, (Result == "CONFIG-MISMATCH" & ExpectedIsNullable != FoundIsNullable))
nullableValues_Table <- select(nullableValues, c("TableName", "ColumnName", "ExpectedIsNullable", "FoundIsNullable"))
if (params$QAAlert == TRUE & nrow(nullableValues_Table) > 0){
  nullableValues_Table$message <- paste0(nullableValues_Table$TableName, ".", nullableValues_Table$ColumnName, sep="")
  QA_Alert_Message_107_items <- paste0(nullableValues_Table$message, collapse = ', ')
  QA_Alert_Message_107 <- paste0("The following required fields have non-permissible missing values: ",
                                  QA_Alert_Message_107_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.07",QA_Alert_Message_107)
}
orphan_person_ids <- subset(ref_integrity, ((TargetColumn == "PERSON_ID" | ReferenceColumn == "PERSON_ID") & ValuesNotFound > 0))
orphan_person_ids_table <- select(orphan_person_ids, c("TargetTable", "TargetColumn", "ReferenceTable", "ReferenceColumn", "ValuesNotFound", "TargetTableDistinctCount", "PercentOfDistinctTargetColumnMissing"))
if (params$QAAlert == TRUE & nrow(orphan_person_ids_table) > 0){
  orphan_person_ids_sub_table <- data.frame(orphan_person_ids_table, stringsAsFactors = FALSE)
  orphan_person_ids_sub_table$message <- paste0(orphan_person_ids_sub_table$TargetTable, sep="")
  QA_Alert_Message_108_items <- paste0(orphan_person_ids_sub_table$message, collapse = ', ')
  QA_Alert_Message_108 <- paste0("The following tables contain orphan PERSON_IDs: ",
                                  QA_Alert_Message_108_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.08",QA_Alert_Message_108)
}
orphan_enc_ids <- subset(ref_integrity, (TargetColumn == "ENC_ID" | ReferenceColumn == "ENC_ID") & ValuesNotFound > 0)
orphan_enc_ids_table <- select(orphan_enc_ids, c("TargetTable", "TargetColumn", "ReferenceTable", "ReferenceColumn", "ValuesNotFound", "TargetTableDistinctCount", "PercentOfDistinctTargetColumnMissing"))
if (params$QAAlert == TRUE & nrow(orphan_enc_ids_table) > 0){
  orphan_enc_ids_sub_table <- data.frame(orphan_enc_ids_table, stringsAsFactors = FALSE)
  orphan_enc_ids_sub_table$message <- paste0(orphan_enc_ids_sub_table$TargetTable, sep="")
  QA_Alert_Message_109_items <- paste0(orphan_enc_ids_sub_table$message, collapse = ', ')
  QA_Alert_Message_109 <- paste0("The following tables contain orphan ENCOUNTER_IDs: ",
                                  QA_Alert_Message_109_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.09",QA_Alert_Message_109)
}
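# Data Check 1.10: fields replicated from ENCOUNTERS into DIAGNOSES and PROCEDURES
# (person_id, adate, enctype) should agree; count rows where they differ.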
repErrors <- run_query(
                      paste0(
                        "
select 'Diagnosis' as tablename, 
       count(*) as nrows,
       sum(case when a.person_id <> b.person_id then 1 else 0 end) as Person_ID,
       sum(case when a.adate <> b.adate then 1 else 0 end) as Adate,
       sum(case when a.enctype <> b.enctype then 1 else 0 end) as EncType
from ", encounters, " as a inner join (select distinct enc_id, person_id, adate, enctype from ", diagnoses, ") as b on a.enc_id = b.enc_id
union
select 'Procedure' as tablename, 
       count(*) as nrows,
       sum(case when a.person_id <> b.person_id then 1 else 0 end) as  Person,
       sum(case when a.adate <> b.adate then 1 else 0 end) as  Adate,
       sum(case when a.enctype <> b.enctype then 1 else 0 end) as  EncType
from ", encounters, " as a inner join (select distinct enc_id, person_id, adate, enctype from ", procedures, ") as b on a.enc_id = b.enc_id
                        "
                      )) 
repErrors_tx <- tidyr::gather(repErrors, 'field','n_bad',Person_ID, Adate, EncType) %>% arrange(tablename) %>%
  within({
    NP_bad<- paste0(n_bad,' (',round(100*n_bad/nrows, 2),')')
  })


QA_Alert_Message_1.10 <- if(nrow(subset(repErrors_tx, n_bad>0))>0) {"There are replication errors between the ENCOUNTERS, PROCEDURES, and DIAGNOSES tables"} else {
  "No replication errors were found between the ENCOUNTERS, PROCEDURES, and DIAGNOSES tables."
}
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.10",QA_Alert_Message_1.10)
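# Data Check 1.11: count ENC_IDs in the encounter table that are associated with more than one PERSON_ID.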
GR1Person <- run_query(
                      paste0(
                        "
select 'Encounter' as tablename, count(*) as n_encid, 
       sum(case when N_person>1 then 1 else 0 end) as N_gr1
from ( select enc_id,
       count(distinct person_id) as N_person
       from ", encounters," 
       group by enc_id
) as qry
"
))
GR1Person <- within(GR1Person, {
  pct <- round(100*N_gr1/n_encid, 2)
})
#if(GR1Person$pct>5){
#  print(paste0("WARNING: The encounter table has more than 5% of encounters assigned to more than 1 person (",GR1Person$pct,"%)"))
#} else{
#  print(paste0("The encounter table has ", GR1Person$pct,"% of encounters assigned to more than 1 person"))
#}
encItoManyMsg <- if(GR1Person$pct>5){
  paste0("WARNING: The encounter table has more than 5% of encounters assigned to more than 1 person (",GR1Person$pct,"%)")
} else{
  paste0("The encounter table has ", GR1Person$pct,"% of encounters assigned to more than 1 person")
}

  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.11",encItoManyMsg)
orphan_provider_ids <- subset(ref_integrity, ((TargetColumn %in% c("RXMD", "PROVIDER") | ReferenceColumn %in% c("RXMD", "PROVIDER"))) & ValuesNotFound > 0)
orphan_provider_ids_table <- select(orphan_provider_ids, c("TargetTable", "TargetColumn", "ReferenceTable", "ReferenceColumn", "ValuesNotFound", "TargetTableDistinctCount", "PercentOfDistinctTargetColumnMissing"))
if (params$QAAlert == TRUE & nrow(orphan_provider_ids_table) > 0){
  orphan_provider_ids_sub_table <- data.frame(orphan_provider_ids_table, stringsAsFactors = FALSE)
  orphan_provider_ids_sub_table$message <- paste0(orphan_provider_ids_sub_table$TargetTable, sep="")
  QA_Alert_Message_112_items <- paste0(orphan_provider_ids_sub_table$message, collapse = ', ')
  QA_Alert_Message_112 <- paste0("The following tables contain orphan PROVIDER_IDs: ",
                                  QA_Alert_Message_112_items)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.12",QA_Alert_Message_112)
}
connection113 <- get_connection()
code_conform_table_counts <- sqlQuery(connection113, paste0("
 /*
 - DIAGNOSES:
        *ICD09:length is not between 3-5 OR has alpha characters other than 
         E or V OR has no numeric characters OR first 3 digits (min length) are 0;
        *ICD10: length is not between 3 and 7 OR 1st character is not alpha OR 
         first 3 digits (min length) are 0 or 9 OR has no numeric characters;
 - PROCEDURES:
        *CPT/HCPCS are length <5 OR first 5 are all 0 or 9 OR no numeric;
        *ICD9 is not length 3 or 4 OR any alpha OR all 0;
        *ICD10 is not length 7 OR no numeric OR all 7 are 0s or 9s;
 - PRESCRIBING:
        *flag codes with any alphabetical characters OR a length <2 or >7;
 - PHARMACY:
        *length not 11 OR any alpha OR a string of 0 or 9;
 - LAB_RESULT
        *flag for any alphabetical characters OR a length less than 3 or greater 
         than 7 OR the absence of a dash after the next to last position;
*/
SET NOCOUNT ON;
DROP TABLE IF EXISTS 
     #VdwInvalidCodes;
SELECT        
       *
INTO 
     #VdwInvalidCodes
FROM              
(
    SELECT    
           a.TableName, 
           a.CodeType, 
           a.Code, 
           a.ValidResult
    FROM          
    (
        SELECT 
               'DIAGNOSES' AS TableName, 
               DX AS Code, 
               DX_CODETYPE AS CodeType,
               CASE
                   WHEN LEN(replace(DX, '.', '')) NOT BETWEEN 3 AND 7
                   THEN 'Code Length must be between 3 and 7 characters (excluding \".\")'
                   WHEN DX NOT LIKE '[A-TV-Z]%'
                   THEN 'Starting character must be a letter and not \"U\"'
                   WHEN SUBSTRING(DX, 1, 3) IN('000', '999')
                   THEN 'Invalid Numeric Value Range'
                   WHEN DX NOT LIKE '%[0-9]%'
                   THEN 'Values must include a numerical component'
                   ELSE 'OK'
               END AS ValidResult
        FROM ",    
             diagnoses, "
        WHERE   DX_CODETYPE = '10'
        UNION ALL
        SELECT 
               'DIAGNOSES' AS TableName, 
               DX AS Code, 
               DX_CODETYPE AS CodeType,
               CASE
                   WHEN LEN(replace(DX, '.', '')) NOT BETWEEN 3 AND 5
                   THEN 'Length must be between 3 and 5 characters (excluding \".\")'
                   WHEN(DX NOT LIKE '[EV]%'
                        AND DX NOT LIKE '[0-9]%')
                   THEN 'Starting Character must be \"E\", \"V\", or a number'
                   WHEN SUBSTRING(DX, 1, 3) IN('000')
                   THEN 'Invalid Numeric Value Range'
                   WHEN DX NOT LIKE '%[0-9]%'
                   THEN 'Numeric Values Not Detected'
                   ELSE 'OK'
               END AS ValidResult
        FROM  ",   
             diagnoses, "
        WHERE   DX_CODETYPE = '09'
        UNION ALL
        SELECT 
               'PROCEDURES' AS TableName, 
               PX AS Code, 
               PX_CODETYPE AS CodeType,
               CASE
                   WHEN LEN(PX) < 5
                   THEN 'Code length must be at least 5 characters'
                   WHEN SUBSTRING(PX, 1, 5) IN('00000', '99999')
                   THEN 'Invalid Numeric Value'
                   WHEN TRY_PARSE(PX AS INT) IS NULL
                   THEN 'Non-numeric Characters Detected'
                   ELSE 'OK'
               END AS ValidResult
        FROM  ",   
             procedures, "
        WHERE   PX_CODETYPE = 'C4'
        UNION ALL
        SELECT 
               'PROCEDURES' AS TableName, 
               PX AS Code, 
               PX_CODETYPE AS CodeType,
               CASE
                   WHEN LEN(replace(PX, '.', '')) NOT BETWEEN 3 AND 4
                   THEN 'Code length must be between 3 and 4 characters'
                   WHEN PX NOT LIKE '[0-9]%'
                   THEN 'Invalid Start Character'
                   WHEN PX IN('00000')
                   THEN 'Invalid Numeric Value'
                   ELSE 'OK'
               END AS ValidResult
        FROM  ",
             procedures, "
        WHERE   PX_CODETYPE = '09'
        UNION ALL
        SELECT 
               'PROCEDURES' AS TableName, 
               PX AS Code, 
               PX_CODETYPE AS CodeType,
               CASE
                   WHEN LEN(PX) != 7
                   THEN 'Code Length Must Equal 7'
                   WHEN PX NOT LIKE '%[0-9]%'
                   THEN 'Code values must include a numerical component'
                   WHEN PX IN('0000000', '9999999')
                   THEN 'Invalid Numeric Value'
                   ELSE 'OK'
               END AS ValidResult
        FROM  ",   
             procedures,"
        WHERE   PX_CODETYPE = '10'
        UNION ALL
        SELECT 
               'PRESCRIBING' AS TableName, 
               RXNORM AS Code, 
               'RXNORM' AS CodeType,
               CASE
                   WHEN RXNORM LIKE '%[A-Z]%'
                   THEN 'Code cannot contain alphabetical characters'
                   WHEN LEN(RXNORM) NOT BETWEEN 2 AND 7
                   THEN 'Code length must be between 2 and 7 characters'
                   ELSE 'OK'
               END AS ValidResult
        FROM  ",   
             prescribing, "
        UNION ALL
        SELECT 
               'PHARMACY' AS TableName, 
               NDC AS Code, 
               'NDC' AS CodeType,
               CASE
                   WHEN NDC LIKE '%[A-Z]%'
                   THEN 'Code cannot contain alphabetical characters'
                   WHEN LEN(NDC) != 11
                   THEN 'Code length must be 11 characters'
                   WHEN NDC IN('00000000000', '99999999999')
                   THEN 'Invalid Numeric Value'
                   ELSE 'OK'
               END AS ValidResult
        FROM  ",   
             pharmacy, "
        UNION ALL
        SELECT 
               'LAB_RESULTS' AS TableName, 
               LOINC AS Code, 
               'LOINC' AS CodeType,
               CASE
                   WHEN LOINC LIKE '%[A-Z]%'
                   THEN 'Code cannot contain alphabetical characters'
                   WHEN LEN(LOINC) NOT BETWEEN 3 AND 7
                   THEN 'Code length must be between 3 and 7 characters'
                   WHEN SUBSTRING(LEFT(REVERSE(RTRIM(LTRIM(LOINC))), 2), 2, 2) != '-'
                   THEN 'No hyphen character in the second to last position'
                   ELSE 'OK'
               END AS ValidResult
        FROM ",  
             lab_results, "
    ) a
    WHERE a.validResult != 'OK'
) InvalidCodes;
WITH CTE_CodeCounts
     AS (SELECT 
                'DIAGNOSES' AS TableName, 
                DX_CODETYPE AS CodeType, 
                COUNT(1) CountAll
         FROM ",    
              diagnoses, "
         GROUP BY 
                  DX_CODETYPE
         UNION ALL
         SELECT 
                'PROCEDURES' AS TableName, 
                PX_CodeType AS CodeType, 
                COUNT(1) CountAll
         FROM ",    
              procedures, " 
         GROUP BY 
                  PX_CodeType
         UNION ALL
         SELECT 
                'PRESCRIBING' AS TableName, 
                'RXNORM' AS CodeType, 
                COUNT(1) CountAll
         FROM ",    
              prescribing, "
         UNION ALL
         SELECT 
                'PHARMACY' AS TableName, 
                'NDC' AS CodeType, 
                COUNT(1) CountAll
         FROM ",    
              pharmacy, "
         UNION ALL
         SELECT 
                'LAB_RESULTS' AS TableName, 
                'LOINC' AS CodeType, 
                COUNT(1) CountAll
         FROM ",   
              lab_results, "
    )
     SELECT 
            b.TableName, 
            b.CodeType, 
            cc.CountAll, 
            b.CountInvalid AS CountInvalid, 
            ROUND(CAST(b.CountInvalid AS DECIMAL) / CAST(cc.CountAll AS DECIMAL) * CAST(100.0 AS DECIMAL), 2)
            AS PercentInvalid
     FROM       
     (
         SELECT 
                z.TableName, 
                z.CodeType, 
                COUNT(1) AS CountInvalid
         FROM   
              #VdwInvalidCodes z
         GROUP BY 
                  z.TableName, 
                  z.CodeType
     ) b
     JOIN CTE_CodeCounts cc
          ON cc.TableName = b.TableName
             AND cc.CodeType = b.CodeType
     ORDER BY 
              b.TableName, 
              b.CodeType;
"))
top_50_invalid <- sqlQuery(connection113, "
SELECT 
        TableName, 
    CodeType, 
    Code, 
    ValidResult, 
        CountInvalid
FROM       
(
    SELECT 
           TableName, 
           CodeType, 
           Code, 
           ValidResult, 
           CountInvalid, 
           ROW_NUMBER() OVER(PARTITION BY TableName
           ORDER BY 
                    CountInvalid DESC) RowNum
    FROM       
    (
        SELECT 
               z.TableName, 
               z.CodeType, 
               z.Code, 
               z.ValidResult, 
               COUNT(1) AS CountInvalid
        FROM   
             #VdwInvalidCodes z
        GROUP BY 
                 z.TableName, 
                 z.CodeType, 
                 z.Code, 
                 z.ValidResult
    ) InvalidCount
) Top50
WHERE  RowNum <= 50
ORDER BY TableName, CodeType, CountInvalid Desc;
")
odbcClose(connection113)
if (params$QAAlert == TRUE){
  code_conform_table_counts_table <- subset(code_conform_table_counts, PercentInvalid > 5.0)
  if (!rlang::is_empty(code_conform_table_counts_table) & nrow(code_conform_table_counts_table) > 0){
    code_conform_table_counts_table$message <- paste0(code_conform_table_counts_table$TableName, sep="")
    QA_Alert_Message_113_items <- paste0(code_conform_table_counts_table$message, collapse = ', ')
    QA_Alert_Message_113 <- paste0("More than 5% of ICD, CPT, LOINC, RXCUI, or NDC codes do not conform to the expected length or content: ",
                                  QA_Alert_Message_113_items)
    QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("1.13",QA_Alert_Message_113)
  }
}
knitr::kable(code_conform_table_counts, row.names = FALSE, col.names =c("Table Name", "Code Type", "Count All", "Count Invalid", "% Invalid"))

\newline

# DC 2.01
tabDates <- run_query("
SELECT DISTINCT 
       a.name AS tabname, 
       b.name AS colname
FROM   
     sys.objects a
     INNER JOIN sys.columns b
          ON a.object_id = b.object_id
WHERE  type IN('U', 'V')
ORDER BY 
         a.name;") %>% 
  subset(grepl('date',colname, ignore.case = T))

modDates <- run_query("select name, modify_date from sys.objects WHERE  type IN('U', 'V')")
futureDateCode <- merge(tabDates, modDates, by.x='tabname', by.y='name') %>%
  within({
    sqlcode <- paste0("select '",tabname, "' as tableName, '", colname,"' as dateName ,count(*) as nrows, sum(case when a.",colname,">b.modify_date then 1 else 0 end) as futureDate from ",
tabname, " as a , (select modify_date from sys.objects where type IN('U', 'V') AND name ='",tabname,"') as b"
)
  })
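# Each row of futureDateCode$sqlcode is a query that counts, for one date column, rows dated
# after the table's last modification date (modify_date from sys.objects), i.e. apparent future dates.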
#test <- run_query(futureDateCode[1, "sqlcode"])
fdatelst <- list()
for (i in 1:nrow(futureDateCode)){
  fdatelst[[i]] <- run_query(futureDateCode[i, "sqlcode"])
}
fdatesdone <- do.call('rbind', fdatelst) %>%
  within(.,{
    pctFuture <- round(100*futureDate/nrows, 4)
  })
#params$QAAlert<-F
### add message if no table is a problem 
futureDataMsg <- NULL
futureDataMsg <- if(nrow(subset(fdatesdone, pctFuture>=5))>0  ){
paste("The follosing tables exceeded the 5% limit on future dates:", subset(fdatesdone, pctFuture>=5)$tableName, sep = ' ')
} else if(nrow(subset(fdatesdone, pctFuture>=5))==0  ) {
                   "No table exceeds the 5% limit on future dates."
}  

  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("2.01",futureDataMsg)
# select count(*) as personCount, sum(case when (minage not between 0 and 89) or (maxage not between 0 and 89) then 1 else 0 end ) as N_outOfRange
patAgeRange <- run_query(paste0("
select count(*) as nrows, 
       sum(case when (minage <0) or (maxage <0) then 1 else 0 end ) as N_low,
       sum(case when (minage >=90) or (maxage >=90) then 1 else 0 end ) as N_high
from
(
select distinct  b.*, datediff(YY, a.birth_date, b.first_dt) as minage,  datediff(YY, a.birth_date, b.last_dt) as maxage 
from ",demographics, " as a inner join 
  (select person_id, min(adate) as first_dt, max(adate) as last_dt from ",encounters, "  group by person_id  ) as b on a.person_id = b.person_id    
where a.birth_date is not null
) as qry
                        "))
htOutOfRange <- run_query(paste0("

select count(*) as nrows,
       sum(case when ht <0.0 then 1 else 0 end ) as N_low,
       sum(case when ht>=95.0 then 1 else 0 end ) as N_high
from ", vital_signs, " 
 where ht is not null
                             "))
wtOutOfRange <- run_query(paste0("

select count(*) as nrows,
       sum(case when wt <0.0 then 1 else 0 end ) as N_low,
       sum(case when wt>350 then 1 else 0 end ) as N_high
from ", vital_signs, " 
 where wt is not null
                             "))
dbpOutOfRange <- run_query(paste0("

select count(*) as nrows,
       sum(case when diastolic <40 then 1 else 0 end ) as N_low,
       sum(case when diastolic>120 then 1 else 0 end ) as N_high
from ", vital_signs, " 
 where diastolic is not null
                             "))
sbpOutOfRange <- run_query(paste0("

select count(*) as nrows,
       sum(case when systolic <40 then 1 else 0 end ) as N_low,
       sum(case when systolic>210 then 1 else 0 end ) as N_high
from ", vital_signs, " 
 where systolic is not null
                             "))

daysSupplyOutOfRange <- run_query(paste0("

select count(*) as nrows,
       sum(case when rx_days_supply <1 then 1 else 0 end ) as N_low,
       sum(case when rx_days_supply>90 then 1 else 0 end ) as N_high
from ", prescribing, " 
 where rx_days_supply is not null
                             "))
allOutOfRange <- 
  rbind(
    cbind(table='Demographic/Encounter', item='Age (people)', low='< 0 yrs',high='> 89 yrs', patAgeRange),
    cbind(table='Vital_signs',item='Height (records)',low= '< 0 inches',high='>= 95 inches', htOutOfRange),
    cbind(table='Vital_signs',item='Weight (records)',low= '< 0 pounds',high='> 350 pounds', wtOutOfRange),
    cbind(table='Vital_signs',item='Diastolic BP (records)',low= '< 40 mmHg',high='> 120 mmHg', dbpOutOfRange),
    cbind(table='Vital_signs',item='Systolic BP (records)',low= '< 40 mmHg',high='> 210 mmHg', sbpOutOfRange),
    cbind(table='Prescribing',item='Prescribed days supply (records)',low= '< 1 day',high='> 90 days', daysSupplyOutOfRange)
)
allOutOfRange <- within(allOutOfRange, {
  NP_low = paste0(N_low,' (',round(100*N_low/nrows, 2),')')
  NP_high = paste0(N_high,' (',round(100*N_high/nrows, 2),')')
})
#knitr::kable(
#subset(allOutOfRange, select=c(table,item, low, high, nrows, NP_low, NP_high)),
#col.names = c('Table','Field','Check Low','High','N','Low values N (%)','High values N (%)')
#)

outOfRangeMsg <- NULL
outOfRangeMsg <- if(nrow(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1))>0 ){

  paste('More than 10% of records fall into the lowest or highest categories of age, height, weight, diastolic blood pressure, systolic blood pressure, or dispensed days supply:', paste(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1)$item, collapse = ', '))
} else if(nrow(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1))==0 ) {
                   "No table exceeds the 5% limit."
}
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("2.02",outOfRangeMsg)
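# Data Check 2.03: for each clinical table, count patients whose event dates fall before birth_date
# (from DEMOGRAPHICS) or after deathdt (from DEATH), i.e. illogical date relationships.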
encTabBirth2.03 <- run_query(paste0("
 select sum(dobPostAdate) as dobPostAdate,
         sum(dobPostDdate) as dobPostDdate
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.birth_Date, b.adate)<0 and b.adate is not null then 1 else 0 end) as dobPostAdate,
                              max(case when datediff(day, a.birth_Date, b.ddate)<0 and b.ddate is not null then 1 else 0 end) as dobPostDdate
                       from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join
                            ", encounters, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
encTabDeath2.03 <- run_query(paste0("
 select sum(deathPreAdate) as deathPreAdate,
         sum(deathPreDdate) as deathPreDdate
  from (
                      select a.person_id, 
                              max(case when datediff(day, a.deathdt, b.adate)>0 and b.adate is not null then 1 else 0 end) as deathPreAdate,
                              max(case when datediff(day, a.deathdt, b.ddate)>0 and b.ddate is not null then 1 else 0 end) as deathPreDdate
                       from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join
                            ", encounters, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
procTabBirth2.03 <- run_query(paste0("
 select sum(dobPostProc) as dobPostProc 
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.birth_Date, procdate)<0 and b.procdate is not null then 1 else 0 end) as dobPostProc
                       from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join
                            ", procedures, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
procTabDeath2.03 <- run_query(paste0("
 select sum(deathPreProc) as deathPreProc
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.deathdt, b.procdate)>0 and b.procdate is not null then 1 else 0 end) as deathPreProc
                       from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join
                            ", procedures, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
vitalsTabBirth2.03 <- run_query(paste0("
 select sum(dobPostMeasure) as dobPostMeasure 
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.birth_Date, b.measure_date)<0 and b.measure_date is not null then 1 else 0 end) as dobPostMeasure
                       from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join
                            ", vital_signs, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
vitalsTabDeath2.03 <- run_query(paste0("
 select sum(deathPreMeasure) as deathPreMeasure
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.deathdt, b.measure_date)>0 and b.measure_date is not null then 1 else 0 end) as deathPreMeasure
                       from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join
                            ", vital_signs, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
pharmTabBirth2.03 <- run_query(paste0("
 select sum(dobPostRx) as dobPostRx 
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.birth_Date, b.rxdate)<0 and b.rxdate is not null then 1 else 0 end) as dobPostRx
                       from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join
                            ", pharmacy, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
pharmTabDeath2.03 <- run_query(paste0("
 select sum(deathPreRx) as deathPreRx
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.deathdt, b.rxdate)>0 and b.rxdate is not null then 1 else 0 end) as deathPreRx
                       from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join
                            ", pharmacy, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )

prescribTabBirth2.03 <- run_query(paste0("
 select sum(dobPostRxStart) as dobPostRxStart 
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.birth_Date, b.rx_start_date)<0 and b.rx_start_date is not null then 1 else 0 end) as dobPostRxStart
                       from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join
                            ", prescribing, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
prescribTabDeath2.03 <- run_query(paste0("
 select sum(deathPreRxStart) as deathPreRxStart
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.deathdt, b.rx_start_date)>0 and b.rx_start_date is not null then 1 else 0 end) as deathPreRxStart
                       from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join
                            ", prescribing , " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
labTabBirth2.03 <- run_query(paste0("
 select sum(dobPostResult) as dobPostResult 
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.birth_Date, b.result_dt)<0 and b.result_dt is not null then 1 else 0 end) as dobPostResult
                       from (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as a inner join
                            ", lab_results, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
labTabDeath2.03 <- run_query(paste0("
 select sum(deathPreResult) as deathPreResult
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.deathdt, b.result_dt)>0 and b.result_dt is not null then 1 else 0 end) as deathPreResult
                       from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join
                            ", lab_results, " as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
DeathBeforeBirth <- run_query(paste0("
 select sum(deathPreBirth) as deathPreBirth
  from (
                       select a.person_id, 
                              max(case when datediff(day, a.deathdt, b.birth_date)>0  then 1 else 0 end) as deathPreBirth
                       from (select distinct person_id, deathdt from ",death, " where deathdt is not null) as a inner join
                             (select distinct person_id, birth_date from ",demographics, " where birth_date is not null) as b on a.person_id = b.person_id group by a.person_id
) qry


                       ") )
patsInEncTab <- run_query( paste0("select count(distinct person_id) as n from ",encounters))
table2.03 <- cbind(
encTabBirth2.03,
encTabDeath2.03,
procTabBirth2.03,
procTabDeath2.03,
vitalsTabBirth2.03,
vitalsTabDeath2.03,
pharmTabBirth2.03,
pharmTabDeath2.03,
prescribTabBirth2.03,
prescribTabDeath2.03,
labTabBirth2.03,
labTabDeath2.03,
DeathBeforeBirth
)
labels2.03 <- data.frame(name=c( "dobPostAdate","dobPostDdate","deathPreAdate","deathPreDdate","dobPostProc","deathPreProc","dobPostMeasure","deathPreMeasure","dobPostRx","deathPreRx","dobPostRxStart","deathPreRxStart","dobPostResult","deathPreResult","deathPreBirth" ),
                        newlabel = c('Adate<birth_date','Ddate<birth_date','Deathdt<adate','Deathdt<ddate','Procdate<birth_date','ProcDate>deathdt','Measure_date<birth_date','Measure_date>deathdt','rxdate<birth_date','rxdate>deathdt','rx_start_dt<birth_date','rx_start_dt>deathdt','Result_date<birth_date','Result_date>deathdt','Deathdt<birth_date')   ,
                        srctab = c('Encounters','Encounters','Encounters','Encounters','Procedures','Procedures','Vital_signs','Vital_signs','Pharmacy','Pharmacy','Prescribing','Prescribing','lab_results','lab_results','Demographics and death')
)

table2.03_tx <- tidyr::gather(table2.03, "comp", 'n') %>%
  within({
    pct <- round(100*n/patsInEncTab$n, 2)
  }) 
table2.03_tx$ord <- 1:nrow(table2.03_tx)
table2.03_tx <- table2.03_tx%>%
  merge(labels2.03, by.x='comp',by.y='name', all.x=T) %>%
  arrange(ord)
#knitr::kable(table2.03_tx[,c('ord','comp','n','pct', 'srctab')], col.names = c(' ','Date comparison','N Patients','% of unique patients in the encounter table','Source table'))
#params$QAAlert <- T

illogicalDatesMsg <- NULL
illogicalDatesMsg <- if(nrow(subset(table2.03_tx, pct>=5))>0){

  paste("More than 5% of patients have illogical date relationships:", subset(table2.03_tx, pct>=5)$comp, sep=' ')
} else if(nrow(subset(fdatesdone, pctFuture>=5))==0 ) {
                   "No table exceeds the 5% limit on illogical dates."
} 

  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("2.03",illogicalDatesMsg)
# to implement a new check add a new row to the following table:
# table = the standard name of the table in the database (it will automatically correct for you if the actual table names are not standard)
# var = the name of the variable to be checked
# condition = a valid logical condition (in MS SQL Server syntax) which identifies a bad value
# where = a valid where clause to subset the table; be sure to start with "WHERE" (see the example for ddate)
# an illustrative generated query is sketched after the loop below
check3.03 <-
rbind( 
  data.frame(table='demographics',   var='birth_date',            condition='is null', where=' '),
  data.frame(table='demographics',   var='gender',                condition="not in ('M','F','O')", where=' '),
  data.frame(table='encounters',     var='discharge_disposition', condition="not in ('A','E')", where=' '),
  data.frame(table='encounters',     var='ddate',                 condition="is null", where="where enctype = 'IP'"),
  data.frame(table='procedures',     var='procdate',              condition='is null', where=' '),
  data.frame(table='prescribing',    var='rx_order_date',         condition='is null', where=' '),
  data.frame(table='pharmacy',       var='not(rxsup>1 or rxamt>1)',       condition=' ', where=' '), # complicated because it depends on two variables
  #data.frame(table='cause_of_death', var='source',                condition="not in ('S', 'N', 'T', 'B', 'L', 'U', 'O')"),
  data.frame(table='diagnoses',      var='dx_origin',             condition="not in ('OD', 'BI', 'CL', 'PR', 'NI', 'OT')", where=' '),
  data.frame(table='diagnoses',      var='enc_id',                condition="is null", where=' '),
  data.frame(table='procedures',     var='enc_id',                condition="is null", where=' '),
  data.frame(table='vital_signs',    var='enc_id',                condition="is null", where=' ')

)
res3.03 <- list()
for(i in 1:nrow(check3.03)){


  res3.03[[i]] <- with(check3.03, {run_query( 
                                            paste("select '",table[i],"' as dataTable,'",var[i],"' as variable, count(*) as N_rows, sum(case when ",var[i],condition[i],"then 1 else 0 end) as N_bad",
                                                  "from",eval(as.name(tolower(table[i]))), where[i]
                                                  ,
                                                  sep=' '
                                                  )
  )}
  )
}
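# Illustrative only: for the first row of check3.03 the loop above builds a query roughly like
# (assuming the demographics variable resolves to a table named DEMOGRAPHICS via runTableReplacements):
#   select 'demographics' as dataTable, 'birth_date' as variable, count(*) as N_rows,
#          sum(case when birth_date is null then 1 else 0 end) as N_bad
#   from DEMOGRAPHICS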
result3.03 <- do.call('rbind', res3.03) %>%
  within(.,{
    pctBad <- round(100*N_bad/N_rows, 2)
  })
check3.03Msg <- NULL
check3.03Msg <- if(nrow(subset(result3.03, pctBad>=10))>0){
  paste("More than 10% of records have missing or unknown values for the following fields Alert:",subset(result3.03, pctBad>=10)$variable , sep = ' ' )
} else if(nrow(subset(fdatesdone, pctFuture>=5))==0 ) {
                   "No check exceeds the 10% limit on bad values."
}

  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.03",check3.03Msg)
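# Data Checks 3.04/3.05: among patients in DEMOGRAPHICS who appear in the encounter table,
# the percentage who also appear in DIAGNOSES and in PROCEDURES.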
patsWithEncs <- run_query(paste0("select count(*) as nrows, sum(patsWEncDiag) as patsWEncDiag, sum(patsWEncProc) as patsWEncProc from ( select  case when person_id in (select person_id from ", diagnoses, ") then 1 else 0 end as patsWEncDiag",
 ",case when person_id in (select person_id from ", procedures, ") then 1 else 0 end as patsWEncProc",
 "       from ", demographics, " where person_id in (select person_id from ", encounters, ") ) as q") )
patsWithEncs <- within(patsWithEncs, {
  pctWEncDiag <- 100* patsWEncDiag/nrows
  pctWEncproc <- 100* patsWEncProc/nrows
})
patDiagMsg <- with (patsWithEncs,{
  if(pctWEncDiag<50 ){
  paste0('WARNING: Only ',round(pctWEncDiag,1),'% of patients with encounters have diagnoses') 
} else { 
  paste0('NOTE: ',round(pctWEncDiag,1),'% of patients with encounters have diagnoses') 
}
})
patProcMsg <- with (patsWithEncs,{
  if(pctWEncproc<50 ){
  paste0('WARNING: Only ',round(pctWEncproc,1),'% of patients with encounters have procedures') 
} else { 
  paste0('NOTE: ',round(pctWEncproc,1),'% of patients with encounters have procedures') 
}
})

  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.04",patProcMsg)
  QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.05",patDiagMsg)
ipedei_no_principal_diag <- run_query(paste0("SELECT        
       InpatientEnc, 
       InpatientEnc - InpatientEncWPrincipalDiag AS InpatientEncWOPrincipalDiag, 
       CAST(ROUND((1.0 - CAST(InpatientEncWPrincipalDiag AS FLOAT) / CAST(InpatientEnc AS FLOAT)) *
       100.0, 2) as DECIMAL(5,2)) AS InpatientEncWOPrincipalDiagPcnt
FROM              
(
    SELECT    
           COUNT(e.ENC_ID) AS InpatientEnc, 
           SUM(IIF(d.ENC_ID IS NOT NULL, 1, 0)) AS InpatientEncWPrincipalDiag
    FROM ",     
         encounters ," e
         LEFT JOIN
    (
        SELECT DISTINCT 
               ENC_ID
        FROM  ", 
             diagnoses, " d
        WHERE  d.PRINCIPAL_DX = 'P'
               AND d.DX_ORIGIN != 'PR'
    ) d
              ON d.ENC_ID = e.ENC_ID
    WHERE e.ENCTYPE IN('IP', 'EI')
) PxDiags;"))
if (params$QAAlert == TRUE){
  ipedei_no_principal_diag_table <- subset(ipedei_no_principal_diag, InpatientEncWOPrincipalDiagPcnt > 10.0)
  if (!rlang::is_empty(ipedei_no_principal_diag_table) & nrow(ipedei_no_principal_diag_table) > 0){
    QA_Alert_Message_306 <- paste0("More than 10% of IP (inpatient) or ED to inpatient (EI) encounters with any diagnosis don't have a principal diagnosis")
    QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.06",QA_Alert_Message_306)
  }
}
benchmark_start <- run_query(paste0("
  SET NOCOUNT ON;
  DECLARE @BenchmarkStartDate DATE;
  SET @BenchmarkStartDate =
  (
      SELECT 
             IIF(MAX(x.ADATE) <= GETDATE(), MAX(x.ADATE), GETDATE())
      FROM ",   
           encounters, " x
  );
  SELECT @BenchmarkStartDate;
"))
data_result_75_complete <- run_query(str_replace_all(paste0("
  SET NOCOUNT ON;
  DECLARE @BenchmarkPriorYearAvg INT;
  SET @BenchmarkPriorYearAvg =
  (
      SELECT 
             COUNT(1) / 12
      FROM  ", 
           encounters, " e
      WHERE  e.ENCTYPE IN('IP', 'AV', 'ED', 'EI')
      AND e.ADATE >= DATEADD(MONTH, -24, @BenchmarkStartDate)
      AND e.ADATE <= DATEADD(MONTH, -12, @BenchmarkStartDate)
  );

  SELECT 
         FORMAT(ADATE, 'MM-MMM') Month, 
         @BenchmarkPriorYearAvg AS BenchmarkCount, 
         COUNT(1) PriorMonthCount, 
         CAST(ROUND((CAST(COUNT(1) AS FLOAT) / @BenchmarkPriorYearAvg * 100.0), 2) as DECIMAL(5,2))  AS PercentofBenchMark
  FROM ",  
       encounters, " e
  WHERE  e.ENCTYPE IN('IP', 'AV', 'ED', 'EI')
  AND e.ADATE >= DATEADD(MONTH, -12, @BenchmarkStartDate)
  GROUP BY 
           FORMAT(ADATE, 'MM-MMM')
  ORDER BY 
           FORMAT(ADATE, 'MM-MMM');                          

"), "@BenchmarkStartDate", paste0('\'', benchmark_start[1,1], '\'', sep='')))
data_result_75_complete$PercentofBenchMark <- as.numeric(as.character(data_result_75_complete$PercentofBenchMark)) 

QA Alerts

knitr::kable(QA_Alert_Messages, row.names = FALSE)

Data Check 1.03: Required fields are not present

knitr::kable(missing_columnsTables, row.names = FALSE, col.names = c("Table Name", "Column Name"))

Data Check 1.04: Required fields do not conform to data model specifications for data type, length, or name

knitr::kable(invalid_specs, row.names = FALSE, col.names = c("Table Name", "Column Name", "Expected Number Precision", "Found Number Precision", "Expected Number Scale", "Found Number Scale", "Expected Char Length", "Found Char Length", "Expected Date Precision", "Found Date Precision"))

Data Check 1.05: Tables have primary key definition errors

knitr::kable(violated_primary_keys, row.names = FALSE)

Data Check 1.06: Required fields contain values outside of data model specifications

knitr::kable(data_validation)

Data Check 1.07: Required fields have non-permissible missing values

knitr::kable(nullableValues_Table, row.names = FALSE, col.names = c("Table Name", "Column Name", "Expected Field Allows-Null Value", "Found Field Allows-Null Value"))

Data Check 1.08: Tables contain orphan PERSON_IDs

knitr::kable(orphan_person_ids_table, row.names = FALSE, col.names = c("Target Table", "Target Column", "Reference Table", "Reference Column", "Count of Values Not Found", "Distinct Total in Target Table", "Percentage of Missing to Distinct Total"))

Data Check 1.09: Tables contain orphan ENCOUNTER_IDs

knitr::kable(orphan_enc_ids_table, row.names = FALSE, col.names = c("Target Table", "Target Column", "Reference Table", "Reference Column", "Count of Values Not Found", "Distinct Total in Target Table", "Percentage of Missing to Distinct Total"))

Data Check 1.10: Replication errors between the ENCOUNTERS, PROCEDURES, and DIAGNOSES tables

knitr::kable(subset(repErrors_tx, select=c(tablename, nrows, field, NP_bad)),
             col.names = c('Table','N','Field','Errors N(%)'))

Data Check 1.12: Tables contain orphan PROVIDER_IDs

knitr::kable(orphan_provider_ids_table, row.names = FALSE, col.names = c("Target Table", "Target Column", "Reference Table", "Reference Column", "Count of Values Not Found", "Distinct Total in Target Table", "Percentage of Missing to Distinct Total"))

Data Check 1.13: More than 5% of ICD, CPT, LOINC, RXCUI, or NDC codes do not conform to the expected length or content

cat("Top 50 Invalid Codes")
knitr::kable(top_50_invalid, row.names = FALSE, col.names = c("Table Name", "Code Type", "Code", "Valid Result", "Count Invalid"))

Data Check 2.01: More than 5% of records have future dates

  futureDataMsg <- if(nrow(subset(fdatesdone, pctFuture>=5))>0 & params$QAAlert==F){
                 knitr::kable(subset(fdatesdone, pctFuture>=5), col.names = c('Table','Date name','Total rows','N future dates','%'))
} else if(nrow(subset(fdatesdone, pctFuture>=5))==0 & params$QAAlert==F) {
                   "No table exceeds the 5% limit on future dates."
} else if ( params$QAAlert==T) {
                   knitr::kable(fdatesdone, col.names = c('Table','Date name','Total rows','N future dates','%'))
}

futureDataMsg

Data Check 2.02: More than 10% of records fall into the lowest or highest categories of age, height, weight, diastolic blood pressure, systolic blood pressure, or dispensed days supply

outOfRangeMsg <- NULL
outOfRangeMsg <- if(nrow(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1))>0 & params$QAAlert==F){
   knitr::kable(
   subset(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1), select=c(table,item, low, high, nrows, NP_low, NP_high)),
   col.names = c('Table','Field','Check Low','High','N','Low values N (%)','High values N (%)')
)
} else if(nrow(subset(allOutOfRange, (N_low+N_high)/nrows>=0.1))==0 & params$QAAlert==F) {
                   "No table exceeds the 5% limit."
} else if ( params$QAAlert==T) {
  knitr::kable(
  subset(allOutOfRange, select=c(table,item, low, high, nrows, NP_low, NP_high)),
  col.names = c('Table','Field','Check Low','High','N','Low values N (%)','High values N (%)')
)
}
outOfRangeMsg

Data Check 2.03: More than 5% of patients have illogical date relationships

illogicalDatesMsg <- NULL
illogicalDatesMsg <- if(nrow(subset(table2.03_tx, pct>=5))>0 & params$QAAlert==F){
        knitr::kable(subset(table2.03_tx, pct>=5)[,c('ord','comp','n','pct', 'srctab')], 
                    col.names = c(' ','Date comparison','N Patients','% of unique patients in the encounter table','Source table'))
} else if(nrow(subset(table2.03_tx, pct>=5))==0 & params$QAAlert==F) {
                   "No date comparison exceeds the 5% limit on illogical dates."
} else if ( params$QAAlert==T) {
        knitr::kable(table2.03_tx[,c('ord','comp','n','pct', 'srctab')], 
                    col.names = c(' ','Date comparison','N Patients','% of unique patients in the encounter table','Source table'))
                 }

illogicalDatesMsg

Data Check 3.03: More than 10% of records have missing or unknown values for the following fields:

  check3.03Msg <- NULL
check3.03Msg <- if(nrow(subset(result3.03, pctBad>=10))>0 & params$QAAlert==F){
        knitr::kable(subset(result3.03, pctBad>=10)[,c('dataTable','variable','N_rows','N_bad', 'pctBad')], 
                    col.names = c('Table','Variable','N Rows','N with bad values','%'))
} else if(nrow(subset(result3.03, pctBad>=10))==0 & params$QAAlert==F) {
                   "No check exceeds the 10% limit on bad values."
} else if ( params$QAAlert==T) {
        knitr::kable(result3.03[,c('dataTable','variable','N_rows','N_bad', 'pctBad')], 
                    col.names = c('Table','Variable','N Rows','N with bad values','%'))}

check3.03Msg

Data Check 3.06: More than 10% of IP (inpatient) or ED to inpatient (EI) encounters with any diagnosis don't have a principal diagnosis

knitr::kable(ipedei_no_principal_diag, row.names = FALSE, col.names = c("In-Patient Type Encounters", "In-Patient Encounters W/O Principal Diagnoses", "% of In-Patient Type Encounters W/O Principal Diagnoses"))

Data Check 3.07: Encounters, diagnoses, or procedures in an ambulatory (AV), emergency department (ED), ED to inpatient (EI), or inpatient (IP) setting are less than 75% complete three months prior to the current month

cat(paste0('Benchmark Start Date: ', benchmark_start[1,1]))
if (params$QAAlert == TRUE){
  data_result_75_complete_table <- subset(data_result_75_complete, PercentofBenchMark < 75 )
  if (!rlang::is_empty(data_result_75_complete_table) & nrow(data_result_75_complete_table) > 0){
    QA_Alert_Message_307 <- paste0("Encounters, diagnoses, or procedures in an ambulatory (AV), emergency department (ED), ED to inpatient (EI), or inpatient (IP) setting are less than 75% complete three months prior to the current month")
    QA_Alert_Messages[nrow(QA_Alert_Messages) + 1,] = c("3.07",QA_Alert_Message_307)
  }
}
knitr::kable(data_result_75_complete,row.names = FALSE, col.names = c("Month", "Benchmark Count", "Prior Month Count", "% of Benchmark"))

Total program run time:

# close odbc
odbcCloseAll()
endTime <- Sys.time()
runtime <- round(as.numeric(difftime(endTime, startTime, units = "mins")), 2)

Query run time = r runtime minutes


